{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Demonstrating NLTK Modin Interoperability"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## All the examples in this section are taken / adapted from https://www.kirenz.com/post/2021-12-11-text-mining-and-sentiment-analysis-with-nltk-and-pandas-in-python/text-mining-and-sentiment-analysis-with-nltk-and-pandas-in-python/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import modin.pandas as pd\n",
    "import pandas\n",
    "import nltk\n",
    "from nltk.tokenize import RegexpTokenizer\n",
    "from nltk.corpus import stopwords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import some Tweets from Barack Obama \n",
    "modin_df = pd.read_csv(\"https://raw.githubusercontent.com/kirenz/twitter-tweepy/main/tweets-obama.csv\")\n",
    "modin_df.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "modin_df['text'] = modin_df['text'].astype(str).str.lower()\n",
    "modin_df.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "regexp = RegexpTokenizer('\\w+')\n",
    "\n",
    "modin_df['text_token']=modin_df['text'].apply(regexp.tokenize)\n",
    "modin_df.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "nltk.download('stopwords')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Make a list of english stopwords\n",
    "stopwords = nltk.corpus.stopwords.words(\"english\")\n",
    "\n",
    "# Extend the list with your own custom stopwords\n",
    "my_stopwords = ['https']\n",
    "stopwords.extend(my_stopwords)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove stopwords\n",
    "modin_df['text_token'] = modin_df['text_token'].apply(lambda x: [item for item in x if item not in stopwords])\n",
    "modin_df.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "modin_df['text_string'] = modin_df['text_token'].apply(lambda x: ' '.join([item for item in x if len(item)>2]))\n",
    "modin_df[['text', 'text_token', 'text_string']].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "nltk.download('punkt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_words = ' '.join([word for word in modin_df['text_string']])\n",
    "tokenized_words = nltk.tokenize.word_tokenize(all_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.probability import FreqDist\n",
    "\n",
    "fdist = FreqDist(tokenized_words)\n",
    "fdist"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "modin_df['text_string_fdist'] = modin_df['text_token'].apply(lambda x: ' '.join([item for item in x if fdist[item] >= 1 ]))\n",
    "modin_df[['text', 'text_token', 'text_string', 'text_string_fdist']].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#lemmatization\n",
    "nltk.download('wordnet')\n",
    "nltk.download('omw-1.4')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.stem import WordNetLemmatizer\n",
    "\n",
    "wordnet_lem = WordNetLemmatizer()\n",
    "\n",
    "modin_df['text_string_lem'] = modin_df['text_string_fdist'].apply(wordnet_lem.lemmatize)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# check if the columns are equal\n",
    "modin_df['is_equal']= (modin_df['text_string_fdist']==modin_df['text_string_lem'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# show level count\n",
    "modin_df.is_equal.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_words_lem = ' '.join([word for word in modin_df['text_string_lem']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "from wordcloud import WordCloud\n",
    "\n",
    "wordcloud = WordCloud(width=600, \n",
    "                     height=400, \n",
    "                     random_state=2, \n",
    "                     max_font_size=100).generate(all_words_lem)\n",
    "\n",
    "plt.figure(figsize=(10, 7))\n",
    "plt.imshow(wordcloud, interpolation='bilinear')\n",
    "plt.axis('off');"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Replicating NLTK workflow with pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import some Tweets from Barack Obama as pandas df\n",
    "pandas_df = pandas.read_csv(\"https://raw.githubusercontent.com/kirenz/twitter-tweepy/main/tweets-obama.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pandas_df['text'] = pandas_df['text'].astype(str).str.lower()\n",
    "pandas_df.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "regexp = RegexpTokenizer('\\w+')\n",
    "\n",
    "pandas_df['text_token']=pandas_df['text'].apply(regexp.tokenize)\n",
    "pandas_df.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove stopwords\n",
    "pandas_df['text_token'] = pandas_df['text_token'].apply(lambda x: [item for item in x if item not in stopwords])\n",
    "pandas_df.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pandas_df['text_string'] = pandas_df['text_token'].apply(lambda x: ' '.join([item for item in x if len(item)>2]))\n",
    "pandas_df[['text', 'text_token', 'text_string']].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_words = ' '.join([word for word in pandas_df['text_string']])\n",
    "tokenized_words = nltk.tokenize.word_tokenize(all_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.probability import FreqDist\n",
    "\n",
    "fdist = FreqDist(tokenized_words)\n",
    "fdist"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pandas_df['text_string_fdist'] = pandas_df['text_token'].apply(lambda x: ' '.join([item for item in x if fdist[item] >= 1 ]))\n",
    "pandas_df[['text', 'text_token', 'text_string', 'text_string_fdist']].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.stem import WordNetLemmatizer\n",
    "\n",
    "wordnet_lem = WordNetLemmatizer()\n",
    "\n",
    "pandas_df['text_string_lem'] = pandas_df['text_string_fdist'].apply(wordnet_lem.lemmatize)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# check if the columns are equal\n",
    "pandas_df['is_equal']= (pandas_df['text_string_fdist']==pandas_df['text_string_lem'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# show level count\n",
    "pandas_df.is_equal.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_words_lem = ' '.join([word for word in pandas_df['text_string_lem']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "from wordcloud import WordCloud\n",
    "\n",
    "wordcloud = WordCloud(width=600, \n",
    "                     height=400, \n",
    "                     random_state=2, \n",
    "                     max_font_size=100).generate(all_words_lem)\n",
    "\n",
    "plt.figure(figsize=(10, 7))\n",
    "plt.imshow(wordcloud, interpolation='bilinear')\n",
    "plt.axis('off');"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "9752fa87da8bf164654ccc33a595e9110c8fc9bb15d763374a7037fd32519b1f"
  },
  "kernelspec": {
   "display_name": "Python 3.9.7 ('base')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
